# First, we will set a CRAN 'mirror'. This directs R to install the following packages from a certain CRAN repository ('repo'). Normally, R automatically sets the CRAN repo, but sometimes using .Rmd files (instead of .R files) can cause problems.
local({r <- getOption("repos")
r["CRAN"] <- "https://cran.r-project.org"
options(repos=r)
})
# Instead of doing this:
library("dplyr")
library("tidyr")
library("ggplot2")
# etc.
# We will create a list of desired packages and pass it to the `library()` command. Any packages we don't have yet will be automatically installed. Then, R will check to make sure that all the packages were loaded. This saves us from a ton of typing, and from having to read all of the feedback. Check Lesson 1 for more information.
# Make the list of desired packages
list.of.packages <-
c(
"dplyr",
"tidyr",
"ggplot2",
"conflicted",
"here",
"plotly",
"emdbook",
"ROCR" #!!!! This may not be needed
)
# Check if any of the desired packages aren't installed
new.packages <-
list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
# Install any missing packages
if (length(new.packages))
install.packages(new.packages)
## package 'bdsmatrix' successfully unpacked and MD5 sums checked
## package 'caTools' successfully unpacked and MD5 sums checked
## package 'bbmle' successfully unpacked and MD5 sums checked
## package 'gplots' successfully unpacked and MD5 sums checked
## package 'emdbook' successfully unpacked and MD5 sums checked
## package 'ROCR' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\miran\AppData\Local\Temp\RtmpcNvIsK\downloaded_packages
# Load packages into this R session
packages_load <-
lapply(list.of.packages, require, character.only = TRUE)
# Print a warning if there is a problem with installing/loading some of packages
if (any(as.numeric(packages_load) == 0)) {
warning(paste("Package/s: ", paste(list.of.packages[packages_load != TRUE], sep = ", "), "not loaded!"))
} else {
print("All packages were successfully loaded.")
}
rm(list.of.packages, new.packages, packages_load)
# Resolve conflicts (identical commands with different functions) between packages.
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_prefer("layout", "plotly")
# If installation is not working, try changing repos. CRAN repos can be found here: https://cran.r-project.org/mirrors.html
The sample data is collected to be used for estimating the population characteristics (ie. level of disease in the field). Since it is only a sample of that population there is an inherent uncertainty tied to these estimates. The level of uncertainty can can be controlled by collecting reliable sample size. A reliable sample size is, in this context, the one which will provide the similar answer if the sampling is to be randomly repeated.
A coefficient of variation(CV), is a way to measure how spread out values are in a data set relative to the mean. It is calculated as:
CV = σ / μ
where: σ: The standard deviation of dataset
μ: The mean of dataset
The coefficient of variation is simply the ratio between the standard deviation and the mean.
n_plants <- 50 # The total number of plants
infected <- 20 # Number of infected plants
(incidence <- infected /n_plants) # Disease incidence
## [1] 0.4
(var <- (incidence* (1 - incidence))/n_plants) # Variance
## [1] 0.0048
(cv <- sqrt(var)/incidence) # Coefficient of variation
## [1] 0.1732051
cv * 100 # Often expressed in percentages
## [1] 17.32051
CV values in a range of around 0.1 or 0.2 (10% to 20%) are normally
considered satisfactory in for the field plant disease studies.
Reliability of estimated sample using Half-width of the required
confidence interval (normal distribution)
(1.96*sqrt(var))/incidence # H: Half-width of the required confidence interval
## [1] 0.339482
1.96*sqrt(var) #
## [1] 0.1357928
gg <-
expand.grid(
N0 = seq(5,50, by = 5),
M = seq(100, 4000,50 )
) %>%
as_tibble() %>%
mutate(N = round(N0/(1+(N0/M)),2)
) %>%
arrange(N0) %>%
ggplot(aes(M, N, group = N0,color= N0))+
geom_point()+
geom_line(aes(M, N, group = N0,color= N0))+
geom_point(size = .4)+
scale_color_viridis_c()+
scale_y_continuous(name="N", limits=c(0, 50), breaks = seq(5,50, by = 5))+
theme_bw()+
theme(panel.grid.major.y = element_line(color = "gray",
size = 0.5,
linetype = 2))
gg
Package plotly is employed here to produce interactive
plots. Try hovering over the graph to determine the exact value of each
point. Notice the interactive features in the upper right corner.
plotly::ggplotly(gg) %>%
plotly::layout(legend = list(orientation = "h", x = 0.4, y = -0.2))
only with finite population correction
Please note: * While each function can be used to illustrate conceptually sample size, it is important to take into account appropriate knowledge of the pathosystem of interest and conduct pilot studies or consult the literature to determine appropriate information
EstNfcf <- function (M, mean, CV) {
return((1 - mean) / (mean * CV ^ 2))
}
EstFcf <- function (M, mean, CV) {
Nest <- (1 - mean) / (mean * CV ^ 2)
finite <- Nest / M
Nsample <- ifelse(finite > 0.1,
Nest / (1 + (Nest / M)),
Nest)
return(round(Nsample, digits = 1))
}
NestM <- function (M, mean, CV) {
Nest <- (1 - mean) / (mean * CV ^ 2)
finite <- Nest / M
return(round(finite, digits = 2))
}
gg <-
as.data.frame(expand.grid(
mean =seq(.1, .9, .1),
CV =seq(.05, .3, .01),
M = c(100, 500, 1000)
) ) %>%
mutate(
est_nfcf = EstNfcf(M,mean,CV),
est_fcf = EstFcf(M,mean,CV),
nest_m = NestM(M,mean,CV)
) %>%
#Multiplication factor(in brackets) to make the differences obvious on the plot
mutate("NEST/M(X100)" = nest_m *100,
"Estimated-FCF(X5)" = est_fcf*5,
"Estimated-NFCF" = est_nfcf) %>%
pivot_longer(cols = c("Estimated-NFCF", "Estimated-FCF(X5)", "NEST/M(X100)"),
values_to = "sample_size") %>%
ggplot(aes(CV, sample_size, group = mean, color = mean))+
geom_point(size = .5)+
geom_line()+
scale_color_viridis_c()+
facet_grid(M~name)+
theme_bw()+
theme(legend.position = "top")
plotly::ggplotly(gg) %>%
plotly::layout(legend = list(orientation = "h", x = 0.4, y = -0.2))
This code illustrates a situation in which the pattern of disease incidence at that quadrant scale is random. We will draw a sample from a binomial distribution that has the following conditions:
Number of quadrants = 300 (for our example, this would represent the population and we will use a sample as a pilot study to illustrate calculations)
Probability of success (i.e., disease) = 0.25
Number of trials per quadrant (plants) = 10
set.seed(101) enables the reproduction of the example; if desired, this can be changed to simulate different samples
set.seed(101)
field1<-rbinom(n=300, size=10, prob=0.25)
The following code illustrates creating a “field”. Mathematically, we are creating a matrix.
field1.matrix<-matrix(field1,nrow=15,ncol=20)
field1.matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 2 3 2 1 4 2 3 2 2 2 1 7 4
## [2,] 0 4 2 1 4 0 4 1 1 2 2 2 3
## [3,] 3 1 1 4 3 1 3 0 1 5 3 4 2
## [4,] 3 2 1 3 6 3 1 3 3 2 5 0 2
## [5,] 2 0 2 1 4 5 1 6 2 3 1 3 2
## [6,] 2 3 4 1 3 3 4 1 4 5 1 3 0
## [7,] 3 5 1 2 0 3 2 0 3 2 3 6 1
## [8,] 2 1 4 3 2 3 1 3 2 5 3 2 1
## [9,] 3 3 0 3 4 6 1 4 3 2 2 3 3
## [10,] 3 5 5 2 5 0 1 4 1 4 4 4 1
## [11,] 4 4 2 4 1 5 1 6 3 1 5 6 1
## [12,] 3 1 2 3 5 2 0 4 2 2 3 3 0
## [13,] 3 2 3 2 3 2 2 2 2 3 4 4 2
## [14,] 5 2 2 3 4 1 4 2 2 0 3 5 2
## [15,] 2 3 2 3 4 0 3 3 3 2 2 3 4
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 2 2 1 2 3 0 2
## [2,] 3 4 5 4 5 3 3
## [3,] 4 3 1 1 4 5 3
## [4,] 1 2 4 1 1 2 3
## [5,] 2 0 4 0 1 5 2
## [6,] 3 3 1 2 1 2 2
## [7,] 3 2 2 2 3 5 6
## [8,] 2 3 2 1 3 3 3
## [9,] 1 5 2 3 2 2 4
## [10,] 6 0 1 2 2 1 2
## [11,] 1 2 3 1 4 1 2
## [12,] 4 2 2 2 0 1 2
## [13,] 1 2 2 1 3 2 5
## [14,] 3 2 4 4 4 0 2
## [15,] 2 2 2 2 3 1 1
map1_long <-
reshape2::melt(field1.matrix, varnames = c("rows", "cols"))
# See first rows onf melted matrix
head(map1_long)
## rows cols value
## 1 1 1 2
## 2 2 1 0
## 3 3 1 3
## 4 4 1 3
## 5 5 1 2
## 6 6 1 2
point_size <- 11
map1_long %>%
ggplot(aes(factor(cols), factor(rows),
color = value,
label = as.character(value)))+
geom_point(size = point_size, shape = 15)+
geom_text(colour = "black")+
scale_color_gradient("Health status (counts of infected plants):",
low = "lightgreen", high = "#993300")+
theme_bw()+
theme(axis.title = element_blank(),
legend.position = "top",
panel.background = element_rect(terrain.colors(5)[3]),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
coord_equal()
Now, to illustrate the application for sample sizes
Generation of different samples for sample size comparisons
set.seed(500)
field1.sample<-sample(x=field1, size=25)
field1.sample
## [1] 0 3 3 1 2 3 1 2 1 4 2 2 0 2 3 0 3 3 1 2 3 3 2 2 0
First, let’s calculate disease incidence as a proportion
(field1.sampleB<-field1.sample/10)
## [1] 0.0 0.3 0.3 0.1 0.2 0.3 0.1 0.2 0.1 0.4 0.2 0.2 0.0 0.2 0.3 0.0 0.3 0.3 0.1
## [20] 0.2 0.3 0.3 0.2 0.2 0.0
(field1.mean<-mean(field1.sampleB))
## [1] 0.192
(field1.var<-var(field1.sampleB) )#Variance based on normal distribution
## [1] 0.01326667
(field1.varbin<-(field1.mean*(1-field1.mean))/10)
## [1] 0.0155136
Compare variances = index of dispersion (D); values close to 1 indicate patterns not distinguishable from random; much larger than 1 indicate patterns of aggregation (formal tests exist as this is merely for illustration)
field1.var/field1.varbin
## [1] 0.8551636
Sample size calculations for CV = 0.1 and 0.2, respectively
N1<-(1-field1.mean)/(10*field1.mean*0.1^2)
N1
## [1] 42.08333
N2<-(1-field1.mean)/(10*field1.mean*0.2^2)
N2
## [1] 10.52083
Exercise: Draw a new sample of 25 and run the same calculations to determine new sample sizes based on CV = 0.1 and CV = 0.2
This code, when copied and pasted, will duplicate the information needed to replicate the example in the notes pertaining to cluster sampling for disease incidence data:
Note: this distribution is rather flexible and takes on many different forms depending on the parameters, for our illustration, emphasis will be placed on generating an over-dispersed example to illustrate the calculations. Here, prob=0.35 represent the probability a given plant would be infected, based on Bernoulli trials and subsequently the beta-distribution
set.seed(10000)
field2<-rbetabinom(n=300, prob=0.35, size=10, shape1=0.2, shape2=2)
field2.matrix<-matrix(field2,nrow=15,ncol=20)
field2.matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 3 0 1 0 0 0 0 2 0 0 0 0 0
## [2,] 5 3 0 3 0 2 1 1 0 0 0 0 0
## [3,] 1 3 0 0 1 0 0 0 7 1 0 0 0
## [4,] 0 0 0 0 1 0 0 1 0 0 0 0 1
## [5,] 0 0 0 2 2 1 0 0 0 0 3 0 0
## [6,] 7 3 0 0 0 0 0 0 0 0 0 6 0
## [7,] 0 0 0 0 0 3 2 0 1 0 0 0 0
## [8,] 0 0 0 2 1 0 0 0 0 0 0 0 2
## [9,] 0 0 4 2 0 1 0 0 0 4 1 0 0
## [10,] 0 5 0 0 0 0 1 2 0 3 2 8 0
## [11,] 0 1 0 0 0 0 0 0 3 0 7 6 0
## [12,] 10 0 0 0 0 1 0 4 1 1 0 4 0
## [13,] 1 1 0 1 0 1 1 0 0 1 4 0 0
## [14,] 0 0 0 0 1 1 0 0 1 0 0 0 0
## [15,] 3 1 0 0 1 0 1 0 0 0 0 0 5
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 0 1 0 1 1 0 0
## [2,] 0 2 0 0 0 0 5
## [3,] 0 1 0 1 0 1 0
## [4,] 0 3 0 0 0 0 1
## [5,] 0 0 0 0 4 0 0
## [6,] 0 0 0 8 0 4 0
## [7,] 0 0 5 0 0 0 0
## [8,] 0 0 0 0 5 0 0
## [9,] 0 0 0 1 0 1 0
## [10,] 1 6 0 0 0 0 0
## [11,] 0 2 1 4 1 0 1
## [12,] 0 0 2 0 0 0 0
## [13,] 0 0 0 0 0 0 1
## [14,] 0 0 1 0 0 0 0
## [15,] 0 0 0 0 0 0 0
map1_long <-
reshape2::melt(field2.matrix, varnames = c("rows", "cols"))
# See first rows onf melted matrix
head(map1_long)
## rows cols value
## 1 1 1 3
## 2 2 1 5
## 3 3 1 1
## 4 4 1 0
## 5 5 1 0
## 6 6 1 7
point_size <- 11
map1_long %>%
ggplot(aes(factor(cols), factor(rows),
color = value,
label = as.character(value)))+
geom_point(size = point_size, shape = 15)+
geom_text(colour = "black")+
scale_color_gradient("Health status (counts of infected plants):",
low = "lightgreen", high = "#993300")+
theme_bw()+
theme(axis.title = element_blank(),
legend.position = "top",
panel.background = element_rect(terrain.colors(5)[3]),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
coord_equal()
Generate a sample of size 25 that represents the pilot study.
set.seed(602)
field2.sample<-sample(x=field2, size=25)
field2.sample
## [1] 0 0 0 5 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 3 0 3 1 0 0
First, let’s calculate disease incidence as a proportion followed by a comparison of the variances
field2.sampleB<-field2.sample/10
field2.mean<-mean(field2.sampleB)
field2.var<-var(field2.sampleB) #Variance based on normal distribution
field2.varbin<-(field2.mean*(1-field2.mean))/10
Index of dispersion (D)
var.relation<-field2.var/field2.varbin
Question: What does index of dispersion stand for?
N1<-(1-field1.mean)/(10*field1.mean*0.1^2)
N1
## [1] 42.08333
N2<-(1-field1.mean)/(10*field1.mean*0.2^2)
N2
## [1] 10.52083
Exercise: Draw a new sample of 25 and run the same calculations to
determine new sample sizes based on CV = 0.1 and CV = 0.2
Hint: set.seed()
Based on previous knowledge of the Power law relationship, values were calculated to represent to two parameters, A and B (see Madden et al. for more details) Parameters: A=16; B=1.4
N1<-(1-field1.mean)/(10*field1.mean*0.1^2)
N1
## [1] 42.08333
N2<-(1-field1.mean)/(10*field1.mean*0.2^2)
N2
## [1] 10.52083
Define parameters:
A<-16
B<-1.4
CV1 <- .1
CV2 <- .2
Calculate the sample size.
(N.power1<-(A*field2.mean^(B-2)*(1-field2.mean)^B)/(10^B*CV1^2))
## [1] 315.9426
(N.power2<-(A*field2.mean^(B-2)*(1-field2.mean)^B)/(10^B*CV2^2))
## [1] 78.98566
Based on design effect (deff) and the beta-binomial distribution calculations. We use here the var.relation to represent the empirical heterogeneity factor (deff)
deff = 2.54
CV1 <- .1
CV2 <- .2
N.betabinom1 <- ((1 - field2.mean) * deff) / (10 * field2.mean * CV1 ^ 2)
N.betabinom2 <- ((1 - field2.mean) * deff) / (10 * field2.mean * CV2 ^ 2)
SRS.PROPORTION.1 <- function (mean,z,H) {
Nest<-((1-mean)/mean)*(z/H)^2
names(Nest)<-"SampleSize"
return(Nest)
}
gg <-
as.data.frame(expand.grid(
mean = seq(.05, .3, .05),
z = 1.96,
H = seq(.1, .5, .05)
)) %>%
mutate(SampleSize = SRS.PROPORTION.1(mean, z, H)) %>%
ggplot(aes(H, SampleSize, group = mean, color = mean)) +
geom_point(size = .5) +
geom_line() +
scale_color_viridis_c() +
theme_bw()
plotly::ggplotly(gg)
EstNfcf <- function (M,mean,z,H) {
return(((1-mean)/mean)*(z/H)^2)
}
EstFcf <- function (M,mean,z,H) {
Nest<-((1-mean)/mean)*(z/H)^2
finite <-Nest/M
Nsample <-ifelse(finite>0.1,Nest/(1+(Nest/M)),Nest)
return(round(Nsample,digits=1))
}
NestM <- function (M,mean,z,H) {
Nest <-((1-mean)/mean)*(z/H)^2
finite <-Nest/M
return(round(finite, digits=2))
}
gg <-
as.data.frame(expand.grid(
mean = seq(.05, .3, .05),
z = 1.96,
H = seq(.1,.5,.05),
M =1000
) ) %>%
mutate(
est_nfcf = EstNfcf(M,mean,z,H),
est_fcf = EstFcf(M,mean,z,H),
nest_m = NestM(M,mean,z,H)
) %>%
mutate("NEST/M" = nest_m ,
"Estimated-FCF" = est_fcf,
"Estimated-NFCF" = est_nfcf) %>%
pivot_longer(cols = c("Estimated-NFCF", "Estimated-FCF", "NEST/M")) %>%
ggplot(aes(H, value, group = mean, color = mean))+
geom_point(size = .5)+
geom_line()+
scale_color_viridis_c()+
facet_wrap(~name, ncol = 1, scales = "free_y")+
theme_bw()+
theme(legend.position = "top")
plotly::ggplotly(gg) %>%
plotly::layout(legend = list(orientation = "h", x = 0.4, y = -0.2))
h represents the half length of a confidence interval based on a fixed positive number
SRS.PROPORTION.1 <- function (mean,z,h) {
Nest<-(mean*(1-mean))*(z/h)^2
names(Nest)<-"SampleSize"
return(Nest)
}
gg <-
as.data.frame(expand.grid(
mean = seq(.05, .3, .05),
z = 1.96,
h = seq(.1,.5,.05)
) ) %>%
mutate(
SampleSize = SRS.PROPORTION.1(mean,z,h)
) %>%
ggplot(aes(h, SampleSize, group = mean, color = mean))+
geom_point(size = .5)+
geom_line()+
scale_color_viridis_c()+
theme_bw()
plotly::ggplotly(gg)
The following section has functions to calculate sample sizes based on Madden et al. (2007), table 10. Note, for the following, the finite population is not applied; this could be handled using direct calculations. Based on known number of sampling units, etc., or code could be modified accordingly.
EstNfcf <- function (mean, n, CV, M) {
return(round((1 - mean) / (n * mean * CV ^ 2), 2))
}
EstFcf <- function (mean, n, CV, M) {
Nest <- (1 - mean) / (n * mean * CV ^ 2)
finite <- Nest / M
Nsample <- ifelse(finite > 0.1, Nest / (1 + (Nest / M)), Nest)
return(round(Nsample, digits = 1))
}
NestM <- function (mean, n, CV, M) {
Nest <- (1 - mean) / (n * mean * CV ^ 2)
finite <- Nest / M
return(round(finite, digits = 2))
}
gg <-
as.data.frame(expand.grid(
mean =seq(.1, .9, .1),
n = 10,
CV =seq(.05, .3, .01),
M = c(100, 500, 1000)
)) %>%
mutate(
est_nfcf = EstNfcf(mean, n, CV, M),
est_fcf = EstFcf(mean, n, CV, M),
nest_m = NestM(mean, n, CV, M)
) %>%
mutate("NEST/M(X100)" = nest_m *100,
"Estimated-FCF" = est_fcf,
"Estimated-NFCF" = est_nfcf) %>%
pivot_longer(cols = c("Estimated-NFCF", "Estimated-FCF", "NEST/M(X100)")) %>%
ggplot(aes(CV, value, group = mean, color = mean))+
geom_point(size = .5)+
geom_line()+
scale_color_viridis_c()+
facet_grid(M~name)+
theme_bw()+
theme(legend.position = "top")
plotly::ggplotly(gg) %>%
plotly::layout(legend = list(orientation = "h", x = 0.4, y = -0.2))
CLUSTER.RANDOM.2 <- function (mean, n, z, H) {
Nest<-((1-mean)/(n*mean))*(z/H)^2
names(Nest)<-"SampleSize"
return(Nest)
}
gg <-
as.data.frame(expand.grid(
mean = seq(.05, .3, .05),
n = 10,
z = 1.96,
H = seq(.1, .5, .05)
)) %>%
mutate(SampleSize = CLUSTER.RANDOM.2(mean, n, z, H)) %>%
ggplot(aes(H, SampleSize, group = mean, color = mean)) +
geom_point(size = .5) +
geom_line() +
scale_color_viridis_c() +
theme_bw()
plotly::ggplotly(gg)
CLUSTER.RANDOM.3 <- function(mean, n, z, h) {
Nest <- ((mean * (1 - mean)) / n) * (z / h) ^ 2
names(Nest) <- "SampleSize"
return(Nest)
}
gg <-
as.data.frame(expand.grid(
mean = seq(.05, .3, .05),
n = 10,
z = 1.96,
h = seq(.01,.1,.01)
) ) %>%
mutate(
SampleSize = CLUSTER.RANDOM.3(mean,n, z,h)
) %>%
ggplot(aes(h, SampleSize, group = mean, color = mean))+
geom_point(size = .5)+
geom_line()+
scale_color_viridis_c()+
theme_bw()
plotly::ggplotly(gg)